The main stuff will be done here!
For this, I will use MySQL, which I will host locally. The server needs to be running for the Python drivers to work. After that I can make queries when I connect to my locally hosted server. This makes development easier. As I use a different machine than last time when I used these, I have to install them again.
Mysql local install:
Add it to PATH, then run `mysql -V` in a terminal to check it is installed correctly (a restart may be needed). On Windows, the installer creates a reasonable config file with sensible variables.
On linux, there is no need to be added to path, but the config file is empty.
pymssql:
pip install -U pymssql
.ENV
It is a good practice to create a new user with granted permissions (don't use root), but it's necessary to hide these credentials if the work is shared: create an .env file that has the information and block its upload to github!
Helpful stuff
https://www.digitalocean.com/community/tutorials/a-basic-mysql-tutorial
# Core scientific stack used throughout the analysis.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Stdlib helpers (regex cleanup, paths, filesystem access).
import re
import sys
import os
import pathlib
import glob
# Database drivers: pymssql for the MS SQL smoke test, mysql-connector for the real work.
import pymssql
import mysql.connector
from mysql.connector import Error
from mysql.connector import OperationalError, ProgrammingError
# Load the DB credentials from the local .env file so they never reach version control.
from dotenv import load_dotenv
load_dotenv()
Usage of .env is essential in hiding credentials
# Read the DB credentials that load_dotenv() placed into the environment;
# both are None if the .env file is missing the keys.
usr, psswrd = os.getenv('user'), os.getenv('password')
Let's test if this works
#how to use it
# Kept for reference only: a pymssql connection smoke test. The bare triple-quoted
# string below is a no-op at module level, so nothing here actually runs.
"""
# This opens a connection to the MS SQL server
conn = pymssql.connect(user=usr,
password=psswrd,
host='localhost',
database='sys')
cursor = conn.cursor()
# A test query
res = cursor.execute('select * from shot;')
print(cursor.fetchone())
"""
# Smoke-test the local MySQL server: connect with the .env credentials and
# dump every row of sys.session.
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='sys',
                                        use_pure=True)
db_cursor = db_connection.cursor()
db_cursor.execute('select* from session;')
for db in db_cursor:
    print(db)
# FIX: close the cursor before the connection that owns it; the original
# closed the connection first, which can raise on some connector versions.
db_cursor.close()
db_connection.close()
Now I just need to create this course's database and create the table that has the .csv loaded in...
load data infile 'data\_linklist.csv'
into table linklist
fields terminated by ','
lines terminated by '\n'
ignore 1 rows;
The problem is that the setup file has 2 rows specifying where the MySQL server can take files from. One of them prevents local file insertion, the other gives a strict path.
But this needs a well defined table beforehand. With these, the first 3 task is complete:
# Load the uuid -> character decoding table twice (decoder0 is kept for display).
decoder0 = pd.read_csv("data/_decoding.csv", sep=",", encoding='cp1252')
decoder1 = pd.read_csv("data/_decoding.csv", sep=",", encoding='cp1252')
decoder0
# FIX: the original used chained indexing (df['uuid'].loc[i] = ...), which
# triggers SettingWithCopyWarning and may silently fail to write back.
# The vectorized form strips the stray '\r' and surrounding whitespace in place.
for frame in (decoder0, decoder1):
    frame['uuid'] = frame['uuid'].str.replace('\r', '', regex=False).str.strip()
# Lookup dict from uuid to character name, used later for hover labels.
decoderd = dict(zip(decoder1['uuid'], decoder1['character']))
Visualizing the networks with the nodes and links. Nodes that are more connected shall be bigger.
import networkx as nx
# Pull the complete edge list (uuid pairs) from the project database.
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=True)
Llist = []
count = 0
db_cursor = db_connection.cursor()
db_cursor.execute('select* from linklist;')
for db in db_cursor:
    Llist.append(db)
    count = count + 1
print(count)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
Llist[1]
It's strange that the \r is present in the returned result of the query. Regex can remove it, but I have never seen such a thing before. Maybe it is due to the loading?
# Scrub the stray carriage returns from the second uuid of every link tuple.
for idx, link in enumerate(Llist):
    fields = list(link)
    fields[1] = re.sub(r'\r', '', fields[1]).strip()
    Llist[idx] = tuple(fields)
Llist_a = np.array(Llist)
Llist[1]
# GRAPH
# Build an undirected graph from the cleaned link list; duplicate edges collapse,
# which is why the edge count below can be smaller than len(Llist).
G = nx.Graph()
G.add_edges_from(Llist)
# (node count, edge count) — displayed as a quick sanity check.
G.number_of_nodes(), G.number_of_edges()
This is less than the length of the linklist. Does it know that these edges are weighted?
# Total degree of the graph; for an undirected Graph this equals 2 * #edges.
# FIX: the original rebuilt list(G.degree) on every iteration (O(n^2));
# summing the (node, degree) view directly is equivalent and O(n).
sum_node_deg = sum(deg for _node, deg in G.degree())
print(sum_node_deg)
It knows. Yes!
#Let's draw this
# Node sizes scale with degree, normalized so the biggest hub gets size 200.
# FIX: the degree/size expression was duplicated inline; hoisted for readability.
degrees = np.array(np.array(G.degree)[:, 1], dtype=np.uint32)
node_sizes = np.around(degrees / np.max(degrees) * 200)
fig = plt.figure(figsize=(14, 14))
plt.title('Graph of the Acquaintances\n(Spring layout)', fontsize=26)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G,
                       pos,
                       nodelist=np.array(G.nodes),
                       cmap=plt.get_cmap('jet'),
                       #node_color = values,
                       node_size=node_sizes)
nx.draw_networkx_edges(G,
                       pos,
                       width=0.05,
                       #edgelist=red_edges,
                       edge_color='r',
                       arrows=False)
fig.tight_layout()
plt.show()
# Same network with the Kamada-Kawai layout.
# FIX: the rendered title read 'Acquaintancesn' (typo); duplicated size expression hoisted.
degrees = np.array(np.array(G.degree)[:, 1], dtype=np.uint32)
node_sizes = np.around(degrees / np.max(degrees) * 200)
fig = plt.figure(figsize=(14, 14))
plt.title('Graph of the Acquaintances\n(Kamada Kawai Layout)', fontsize=26)
pos = nx.kamada_kawai_layout(G)
nx.draw_networkx_nodes(G,
                       pos,
                       nodelist=np.array(G.nodes),
                       cmap=plt.get_cmap('jet'),
                       #node_color = values,
                       node_size=node_sizes)
nx.draw_networkx_edges(G,
                       pos,
                       width=0.05,
                       #edgelist=red_edges,
                       edge_color='r',
                       arrows=False)
fig.tight_layout()
plt.show()
# Same network with the spiral layout.
# FIX: the rendered title was missing the newline before '(Spiral Layout)'
# that its sibling plots use; duplicated size expression hoisted.
degrees = np.array(np.array(G.degree)[:, 1], dtype=np.uint32)
node_sizes = np.around(degrees / np.max(degrees) * 200)
fig = plt.figure(figsize=(14, 14))
plt.title('Graph of the Acquaintances\n(Spiral Layout)', fontsize=26)
pos = nx.spiral_layout(G)
nx.draw_networkx_nodes(G,
                       pos,
                       nodelist=np.array(G.nodes),
                       cmap=plt.get_cmap('jet'),
                       #node_color = values,
                       node_size=node_sizes)
nx.draw_networkx_edges(G,
                       pos,
                       width=0.05,
                       #edgelist=red_edges,
                       edge_color='r',
                       arrows=False)
fig.tight_layout()
plt.show()
These are all interesting by their looks, but the spring layout seems to be the most accurate one to show the network.
Problem with labels
There are dozens of nodes present in this network; labeling them all would make it crazy hard to read. Interactive plots would solve this!
#import holoviews as hv always, holovws somehow fails
#import bokeh.io fails too
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
def make_edge(x, y, width):
    """Return a red line trace joining the given x/y endpoint lists.

    The endpoint lists are expected in plotly's segment form
    ([x0, x1, None], [y0, y1, None]); width sets the line thickness.
    """
    edge_style = dict(width=width, color='red')
    return go.Scatter(x=x, y=y, line=edge_style, hoverinfo=None, mode='lines')
# Lay the graph out once, then build one thin line trace per edge.
pos = nx.spring_layout(G)
edge_trace = []
for node_a, node_b in G.edges():
    x0, y0 = pos[node_a]
    x1, y1 = pos[node_b]
    edge_trace.append(make_edge([x0, x1, None], [y0, y1, None], width=0.3))
#creating node trace
#creating node trace
# Empty scatter that accumulates one marker per node below.
node_trace = go.Scatter(x=[],
                        y=[],
                        hovertemplate='<b>%{text}</b>',
                        text=[],
                        #textposition = "top center",
                        textfont_size=8,
                        mode='markers',
                        hoverinfo='none',
                        marker=dict(color=[],
                                    size=[],
                                    line=None))
# Fill the node trace: position, colour, log-scaled size and hover label
# ("character name : degree", decoded through decoderd).
for node in G.nodes():
    x, y = pos[node]
    node_trace['x'] += tuple([x])
    node_trace['y'] += tuple([y])
    node_trace['marker']['color'] += tuple(['cornflowerblue'])
    node_trace['marker']['size'] += tuple([5 * np.log(G.degree()[node] + 1)])
    # FIX: the degree was wrapped as '<b>'...'<b>'; the closing HTML tag must be '</b>'.
    node_trace['text'] += tuple(['<b>' + decoderd[node] + '</b> : <b>' + str(G.degree()[node]) + '</b>'])
# Transparent, square, axis-less canvas for the interactive network view.
layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)',  # transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # transparent 2nd background
    xaxis={'showgrid': False, 'zeroline': False},  # no gridlines
    yaxis={'showgrid': False, 'zeroline': False},  # no gridlines
    height=800,
    width=800,
)
fig = go.Figure(layout=layout)
# Edges go in first so the node markers are drawn on top of them.
for trace in edge_trace:
    fig.add_trace(trace)
fig.add_trace(node_trace)
# Strip the legend and tick labels — only the network itself should show.
fig.update_layout(showlegend=False)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.show()
Plotly express could make it better as it can handle pandas dataframes. Creating such dataframe could solve most of my hoverinfo issues. This does it for now!
To get the degree distribution of this undirected and weighted network, I just need to count the appearences of each node in the network and after that I need to count the appearences of such individual degrees.
#degree
# Degree distribution in SQL: the inner query counts each uuid's appearances
# on either side of a link (its degree K_C); the outer query counts how many
# nodes share each degree value.
command5 = ('SELECT distinct Ks.K_C, count(Ks.K_C) from '+
            '('
            'SELECT distinct APPS.UUID as K, count(APPS.UUID) as K_C from '
            '((SELECT uuid1 as UUID FROM linklist)'+
            'UNION ALL' +
            '(SELECT uuid2 as UUID FROM linklist)) as APPS '+
            'GROUP BY APPS.UUID ' +
            'ORDER BY COUNT(APPS.UUID) DESC ' +
            ') as Ks '+
            'GROUP BY Ks.K_C '+
            'ORDER BY COUNT(Ks.K_C) '+
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(command5)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
# Log-log scatter of the degree distribution returned by the SQL query
# (column 0 = degree k, column 1 = number of nodes with that degree).
fig = plt.figure(figsize=(10, 10))
plt.title("Degree distribution", fontsize=24)
plt.plot(query[:, 0], query[:, 1], " bo", label="Degrees from linklist")
plt.xscale('log')
plt.yscale('log')
plt.xlabel("k (degree)", fontsize=16)
plt.ylabel("Appearence", fontsize=16)
plt.legend(loc='best', fontsize=18)
plt.grid()
fig.tight_layout()
plt.show()
This does not look like the degree distribution obtained from Erdős–Rényi random graphs, and I think this is not random at all. Existence of hubs: high-degree nodes.
NOTE: for some reason, the degree in networkx Graph object and the degrees calculated from my linklist differs.
After some reading, it turns out that in SQL, the speed is not good for calculating such stuff.
For calculating the clustering coefficient of node i, which has degree $k_i$ and L links between its neighbors, the equation is simple: $$ C_i = \frac{2 L}{k_i \cdot (k_i-1)} $$ $$ < C_{avg} > = \frac{1}{N} \sum_i C_i $$
where $< C_{avg} >$ is the average clustering coefficient and N is the number of nodes. The global clustering coefficient is calculated in a different way:
$$ C = \frac{\text{number of closed triplets}}{ \text{number of all triplets}} $$Where a triplet is 3 nodes connected with 2 edges (open) or with 3 edges (closed). My plan is to do this in python and do some tinkering in SQL. I didn't find anything about calculating this in SQL online, so my guess is that there are packages that can handle this with ease.
SUGGESTION
Triangles and triples: triangle with 3 vertex and 3 edge, triple with 3 vertex with 2 edge. For a node $\frac{triangles}{triples}$ gives the $C_i$ we are looking for. But how to get triangles and triples?
My though is doing joins. If its possible to do 3 joins with the first and last value to match, its a triangle, if 3 joins are possible, then triangle.
WAIT. HOLD ON
What I could do this with doing 1 joins on linklist with itself. Then it will have 3 values, which are my triples (as I don't know if they form a triangle or not). With doing 4 joins, I can get triangles, which has to be closed, so their first and last values have to match. Counting the triples and triangles and dividing them could give me the global clustering coefficient.
How to do this with local clustering coefficient? It may be simple,but I only have to see if a row contains my value for the node and I just have to see how many triangles has it divided by how many triples has it.
SELF LOOPS
For some reason, self-loops existed in previous versions of the linklist. The problem is that if they exist, and I do self joins on the table, these self-loops could cause triangles that have 2 vertexes that are the same. with 3 joins, it would make it such that these results would have 3 endpoints of the same vertex, but passing the requirment of the first and last point of this 4 point values match, meaning its a closed loop of 2 points, which is invalid. Now its fixed.
END THOUGHTS
So the global clustering coefficient could be done like this: make the triangles and triples by joins (triangles by 2 inner joins but their first and last point has to match: this way I don't need to care about permutations of a possible 3 points that are a triangle) and count triangles and triples and divide them.
The local clustering coefficient could be done like this: create the triangles table with joins, see how many triangles has to node in it (doesn't have to be an endpoint) and then double it and divide by $k_i \cdot (k_i-1)$ as I can know the degree of the node before hand. The equation says that the links between neighbours should be taken twice, which means that the triangles should be taken twice too!
len(list(G.edges))
# First walk experiment: old-style comma join pairing every link with every
# link whose head matches its tail (returns all 4 columns of both rows).
command6 = ('SELECT * '+
            'FROM linklist L1, linklist L2 '+
            'WHERE L1.uuid2 = L2.uuid1 ' +
            'ORDER BY L1.uuid1 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(command6)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
# Same 2-hop walk with an explicit INNER JOIN, keeping only the three uuids
# of the path (a "triple" candidate).
command6 = ('SELECT L1.uuid1 as uuid1, L1.uuid2 as uuid2, L2.uuid2 '+
            'FROM linklist L1 '+
            'INNER JOIN linklist L2 ON L1.uuid2 = L2.uuid1 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(command6)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
# 3-hop walk over the link list.
# FIX: the fourth column selected L2.uuid2 twice; the last hop's endpoint is
# L3.uuid2 (L3 was joined but never appeared in the SELECT list).
command6 = ('SELECT L1.uuid1 as uuid1, L1.uuid2 as uuid2, L2.uuid2 as uuid3, L3.uuid2 as uuid4 '+
            'FROM linklist AS L1 '+
            'INNER JOIN linklist AS L2 ON L1.uuid2 = L2.uuid1 ' +
            'INNER JOIN linklist AS L3 ON L2.uuid2 = L3.uuid1 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(command6)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
We have everything here, but we don't need everything...
# Attempt at closing the 2-hop walk into triangles by joining the path back
# to the link list. NOTE(review): the ON and WHERE both compare against
# PROD.uuid3, which looks suspect (the prose below says this attempt failed);
# kept as-is since the working version comes later.
command6 = ('SELECT L3.uuid1 as uuid1, PROD.uuid1 as uuid2, PROD.uuid2 as uuid3, PROD.uuid3 as uuid4 ' +
            'FROM linklist L3 ' +
            'INNER JOIN ' +
            '(' +
            'SELECT L1.uuid1 as uuid1, L1.uuid2 as uuid2, L2.uuid2 as uuid3 '+
            'FROM linklist L1 '+
            'INNER JOIN linklist L2 ON L1.uuid2 = L2.uuid1 ' +
            ') AS PROD ' +
            'ON L3.uuid2 = PROD.uuid3 ' +
            'WHERE L3.uuid1 = PROD.uuid3 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(command6)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
This doesn't seem to work out for me. I just have to make triangles, with 4 vertexes where the last and the first have to match.
Getting the reverse direction of the first connection leading to actually working method. Logic is that these links are undirected, leading to that both direction should be taken into account. By hand it seems that going through the reverse on only the first link leads to triangles being found. Its very interesting that after this, there is no need for reversed directional path.
FURTHER MANUAL TESTING
As it turns out, one bidirectional start is enough to find at least 1 triangles, but it does not balance different cyclical loops (as right or left is favored). Instead, it is favorable to allow all link joins to be bidirectional and balance them by the following factors:
The found triples are found in both walk directions (so the 1,2,3 and 3,2,1 combinations are both found but these are identical), so I will find 2 times more triples than how many are present (even after sorting out matching endpoints, such as 1,2,1).
The found triangles are worse, because we can only use correct triples (1,2,1 is bad, but 1,2,3 is allowed) and with the bidirectional link addition, we find 6 times more triangles than are present (1,2,3,1 is the same as 2,3,1,2).
These have been tested on paper, by hand
query
# Working triples query on the small test table: make every link bidirectional
# (UNION ALL of both orientations), self-join on the shared middle node, and
# drop degenerate 1,2,1 walks.
com_test = ('SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2'
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
This is how to get triples
# Same triples query wrapped in an outer SELECT so it can later be reused as
# a derived TRIPLES table.
com_test = ('SELECT * ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2' +
            ') as TRIPLES'
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
This is how to get triangles.
# Triangles on the test table: extend each triple by one more bidirectional
# hop and keep only walks that close back on their starting node
# (tr4 joins back to tr1). Each triangle appears 6 times (see prose above).
com_test = ('SELECT * ' +
            'FROM '
            '('
            'SELECT TRIPLES.tr1 tr1, TRIPLES.tr2 tr2 , TRIPLES.tr3 tr3, BIDIR3.u2 tr4 ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2 ' +
            ') as TRIPLES ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR3 ' +
            'WHERE TRIPLES.tr3 = BIDIR3.u1 AND TRIPLES.tr1 = BIDIR3.u2 '
            ') as TRIANGLES '
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
# Per-node degree on the test table: stack both endpoint columns and count
# each uuid's appearances.
com_test = ('SELECT DISTINCT NODES.uuid1 , COUNT(NODES.uuid1) '
            'FROM ( ' +
            'SELECT uuid1 FROM testlist UNION ALL SELECT uuid2 FROM testlist ' +
            ') as NODES ' +
            'GROUP BY NODES.uuid1 '
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
# Distinct node list of the test table (plain UNION deduplicates).
com_test = ('SELECT uuid1 FROM testlist UNION SELECT uuid2 FROM testlist ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
# Count how many triangle rows each node participates in (as any of the three
# corners) on the test table; the raw count still carries the x6 multiplicity.
com_test = ('SELECT DISTINCT NODES.n1, COUNT(*) ' +
            'FROM '
            '('
            'SELECT TRIPLES.tr1 tr1, TRIPLES.tr2 tr2 , TRIPLES.tr3 tr3, BIDIR3.u2 tr4 ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2 ' +
            ') as TRIPLES ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR3 ' +
            'WHERE TRIPLES.tr3 = BIDIR3.u1 AND TRIPLES.tr1 = BIDIR3.u2 '
            ') as TRIANGLES ' +
            ', ' +
            '(SELECT uuid1 n1 FROM testlist UNION SELECT uuid2 FROM testlist) as NODES ' +
            'WHERE NODES.n1 = TRIANGLES.tr1 OR NODES.n1 = TRIANGLES.tr2 OR NODES.n1 = TRIANGLES.tr3 ' +
            'GROUP BY NODES.n1 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
# Local clustering coefficient on the test table:
# NnTR = triangles per node (divided by 6 for the walk multiplicity),
# NnK = node degree; the final column is C_i = 2*T_i / (k_i*(k_i-1)).
com_test = ('SELECT NnK.n1, NnK.k1, NnTR.ctr, 2*NnTR.ctr/(NnK.k1)/(NnK.k1 - 1) FROM ' +
            '(' +
            'SELECT DISTINCT NODES.n1, COUNT(*)/6 ctr ' +
            'FROM ' +
            '(' +
            'SELECT TRIPLES.tr1 tr1, TRIPLES.tr2 tr2 , TRIPLES.tr3 tr3, BIDIR3.u2 tr4 ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2 ' +
            ') as TRIPLES ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM testlist UNION ALL ( SELECT uuid2, uuid1 FROM testlist ) ' +
            ') as BIDIR3 ' +
            'WHERE TRIPLES.tr3 = BIDIR3.u1 AND TRIPLES.tr1 = BIDIR3.u2 '
            ') as TRIANGLES ' +
            ', ' +
            '(SELECT uuid1 n1 FROM testlist UNION SELECT uuid2 FROM testlist) as NODES ' +
            'WHERE NODES.n1 = TRIANGLES.tr1 OR NODES.n1 = TRIANGLES.tr2 OR NODES.n1 = TRIANGLES.tr3 ' +
            'GROUP BY NODES.n1 ' +
            ') as NnTR ' +
            'INNER JOIN ' +
            '(SELECT DISTINCT NODES.n1 n1, COUNT(NODES.n1) k1 '
            'FROM ( ' +
            'SELECT uuid1 n1 FROM testlist UNION ALL SELECT uuid2 FROM testlist ' +
            ') as NODES ' +
            'GROUP BY NODES.n1 ) as NnK ' +
            'WHERE NnK.n1 = NnTR.n1 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
I HAVE GONE THROUGH PAIN AND SUFFERING TO REALISE THAT THE 'ON' CLAUSE DOESN'T WORK THE WAY I HAVE IMAGINED
# The same local clustering coefficient query, now run on the real linklist
# table: per node, its degree k1, triangle count ctr, and C_i = 2*ctr/(k1*(k1-1)).
com_test = ('SELECT NnK.n1, NnK.k1, NnTR.ctr, 2*NnTR.ctr/(NnK.k1)/(NnK.k1 - 1) FROM ' +
            '(' +
            'SELECT DISTINCT NODES.n1, COUNT(*)/6 ctr ' +
            'FROM ' +
            '(' +
            'SELECT TRIPLES.tr1 tr1, TRIPLES.tr2 tr2 , TRIPLES.tr3 tr3, BIDIR3.u2 tr4 ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2 ' +
            ') as TRIPLES ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR3 ' +
            'WHERE TRIPLES.tr3 = BIDIR3.u1 AND TRIPLES.tr1 = BIDIR3.u2 '
            ') as TRIANGLES ' +
            ', ' +
            '(SELECT uuid1 n1 FROM linklist UNION SELECT uuid2 FROM linklist) as NODES ' +
            'WHERE NODES.n1 = TRIANGLES.tr1 OR NODES.n1 = TRIANGLES.tr2 OR NODES.n1 = TRIANGLES.tr3 ' +
            'GROUP BY NODES.n1 ' +
            ') as NnTR ' +
            'INNER JOIN ' +
            '(SELECT DISTINCT NODES.n1 n1, COUNT(NODES.n1) k1 '
            'FROM ( ' +
            'SELECT uuid1 n1 FROM linklist UNION ALL SELECT uuid2 FROM linklist ' +
            ') as NODES ' +
            'GROUP BY NODES.n1 ) as NnK ' +
            'WHERE NnK.n1 = NnTR.n1 ' +
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
Let's quickly check if it's okay
# Hand check of one SQL row: C_i = 2*T_i / (k_i*(k_i-1)) with T_i = 202, k_i = 30.
2*202/(30*29)
In theory this is a correct query giving the correct result.
LOCAL CLUSTERING COEFFICIENT
# Convert the Decimal values MySQL returned (column 3 = C_i) into float32,
# going through a ".15g" string round-trip to normalise the representation.
clusters = np.array([format(i,".15g") for i in query[:,3]], dtype = np.float32)
AVERAGE CLUSTERING COEFFICIENT
# Average clustering coefficient <C_avg> = mean of the per-node C_i values.
np.mean(clusters)
GLOBAL CLUSTERING COEFFICIENT
# Global clustering coefficient = (#closed triplets) / (#all triplets),
# computed as the triangle count divided by the triple count; the x6 and x2
# walk multiplicities cancel to the conventional 3*triangles/triples factor.
com_test = ('SELECT trigs.triangleC / (trips.tripleC ) ' +
            'FROM ' +
            '('
            'SELECT COUNT(*) tripleC ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2' +
            ') as TRIPLES ' +
            ') as trips' +
            ', ' +
            '('
            'SELECT COUNT(*) triangleC ' +
            'FROM '
            '('
            'SELECT TRIPLES.tr1 tr1, TRIPLES.tr2 tr2 , TRIPLES.tr3 tr3, BIDIR3.u2 tr4 ' +
            'FROM ' +
            '('
            'SELECT BIDIR1.u1 tr1, BIDIR1.u2 tr2, BIDIR2.u2 tr3 FROM ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR1 ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR2 ' +
            'WHERE BIDIR1.u2 = BIDIR2.u1 AND BIDIR1.u1 != BIDIR2.u2 ' +
            ') as TRIPLES ' +
            'INNER JOIN ' +
            '( ' +
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL ( SELECT uuid2, uuid1 FROM linklist ) ' +
            ') as BIDIR3 ' +
            'WHERE TRIPLES.tr3 = BIDIR3.u1 AND TRIPLES.tr1 = BIDIR3.u2 '
            ') as TRIANGLES ' +
            ') as trigs'
            ';'
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
# Single-cell result: normalise the Decimal to a plain numeric string.
query = format(query[0][0], ".15g")
print('The global clustering coefficient is: {}'.format(query))
This is much easier: I have to sum up the neighbors degrees and divide it with the node degree. Indeed, as
$$ k_{nn}(k_i) = \frac{1}{k_i} \sum_{j=1}^N A_{ij} k_j $$where $k_i$, $k_j$ are degrees appearing in the network. The approach is straightforward: get the different degrees of the system, look through the linklist to see each node's neighbours, and for a given node save the sum of its neighbours' degrees divided by the degree of the node. After that just sum up these partial results per distinct degree, and it's done.
# Per-node degree table.
# FIX: the inner aggregate had no GROUP BY, so COUNT collapsed the whole table
# to a single row (or errors under ONLY_FULL_GROUP_BY); grouping by the node
# gives one (node, degree) row each, matching the NnK subquery used elsewhere.
com_test = ('SELECT * ' +
            'FROM ' +
            # degree of each node = appearances on either side of a link
            '(SELECT DISTINCT NODES.n1 n1, COUNT(NODES.n1) k1 ' +
            'FROM ( ' +
            'SELECT uuid1 n1 FROM testlist UNION ALL SELECT uuid2 FROM testlist ' +
            ') as NODES ' +
            'GROUP BY NODES.n1 ' +
            ') as NnK ' +
            '; '
            )
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
query
# Degree correlation k_nn(k): for every (bidirectional) link, take the degree
# KI of one endpoint and its neighbour's degree divided by KI (KJ), then sum
# KJ per distinct KI — giving the average neighbour degree per degree class.
com_test = ('SELECT DISTINCT (uKNN.KI), SUM(uKNN.KJ) ' +
            'FROM ' +
            '(' +
            # one row per link end: own degree KI and neighbour-degree/KI
            'SELECT ST1nK1nEN.K1 KI, NnK.k1/ST1nK1nEN.K1 KJ ' +
            'FROM ' +
            '('
            'SELECT NnK.n1 ST1, NnK.k1 K1, BIDIR1.u2 E1 ' +
            'FROM ' +
            # nodes and their degrees
            '(' +
            'SELECT DISTINCT NODES.n1 n1, COUNT(NODES.n1) k1 ' +
            'FROM (SELECT uuid1 n1 FROM linklist UNION ALL (SELECT uuid2 n1 FROM linklist)) as NODES ' +
            'GROUP BY NODES.n1 ' +
            ') as NnK ' +
            ', ' +
            '(' +
            # the other endpoint of each connection, in both orientations
            'SELECT uuid1 u1, uuid2 u2 FROM linklist UNION ALL (SELECT uuid2 u1, uuid1 u2 FROM linklist) ' +
            ') as BIDIR1 ' +
            'WHERE NnK.n1 = BIDIR1.u1 ' +
            ') as ST1nK1nEN '
            ', ' +
            '(' +
            'SELECT DISTINCT NODES.n1 n1, COUNT(NODES.n1) k1 ' +
            'FROM (SELECT uuid1 n1 FROM linklist UNION ALL (SELECT uuid2 n1 FROM linklist)) as NODES ' +
            'GROUP BY NODES.n1 ' +
            ') as NnK ' +
            'WHERE ST1nK1nEN.E1 = NnK.n1 ' +
            ') as uKNN ' +
            'GROUP BY uKNN.KI ' +
            'ORDER BY uKNN.KI ASC ' +
            ';'
            )
# Is this an efficient way to do this? No. Is it good? Maybe.
# Comma joins are used instead of INNER JOIN ... ON because ON did not behave
# as intended here; a recursive walk may be more elegant, but informal testing
# showed JOIN and recursive-walk speeds to be almost identical.
db_connection = mysql.connector.connect(user=usr,
                                        passwd=psswrd,
                                        host='localhost',
                                        database='datamodproj',
                                        use_pure=False)
count = 0
query = []
db_cursor = db_connection.cursor()
db_cursor.execute(com_test)
for db in db_cursor:
    query.append(list(db))
    count = count + 1
print(count)
query = np.array(query)
# FIX: close the cursor before its connection (the original order was reversed).
db_cursor.close()
db_connection.close()
# Normalise the Decimal sums in column 1 to float32 via a ".15g" round-trip.
query[:,1] = np.array([format(i, ".15g") for i in query[:,1]], dtype=np.float32)
NOTE: After I did it this way I realised that I could just replace the nodes with their degrees, create the distinct-k table, and with these two I could have summed it up accordingly and divided it by the degree.
from scipy.optimize import curve_fit as curve_fit
#definig power function
def power(x, a, b):
    """Power-law model b * x**a, used to fit the degree-correlation curve."""
    scaled = x ** a
    return b * scaled
# Fit k_nn(k) with a power law and plot data + fit on log-log axes.
popt, pcov = curve_fit(power, query[:, 0], query[:, 1])
# Smooth x-grid slightly wider than the measured degree range for the fit line.
XXX = np.linspace(query[0, 0] * 0.9, query[-1, 0] * 1.1, 20)
fig = plt.figure(figsize=(10, 10))
# FIX: the rendered title read 'Acquintance' (typo for 'Acquaintance').
plt.title("Degree Correlation of \n The Marvel's Cinematic Universe's Acquaintance Network", fontsize=24)
plt.plot(query[:, 0], query[:, 1], " bo", label="Measure from MCU network")
l0 = "Fit: $k^{" + str(np.around(popt[0], 4)) + "}$"
plt.plot(XXX, power(XXX, *popt), "--r", label=l0)
plt.xlabel("$k_i$", fontsize=28)
plt.ylabel("$k_{nn}$", fontsize=28)
plt.xscale("log")
plt.yscale("log")
plt.legend(loc="lower left", fontsize=22)
fig.tight_layout()
plt.grid()
plt.show()
CONCLUSION
The Marvel's Cinematic Universe's Acquaintance network is clearly a disassortative one.